// .........................................................................
// Title: industry_master.do
//
// Constructs industry files at the cusip6 level by merging data from 
// multiple sources
// .........................................................................

* ---------------------------------------------------------------------------------------------------------
* Industry data from Dealogic
* ---------------------------------------------------------------------------------------------------------

* Firm primary SIC codes
use "$raw/dealogic/CompanySICCodes", clear
keep if isprimary == "True"
save "$tmp/dlg_primary_sic_codes", replace

* Dealogic SIC codes at cusip6 level
use "$tmp/dealogic_dcm_issuance_complete", clear
gen issuer_number = substr(cusip, 1, 6) 
mmerge issuer_number using "$raw/cmns/cmns_aggregation", unmatched(m) 
mmerge issuerid using "$tmp/dlg_primary_sic_codes", unmatched(m) umatch(companyid)
keep issuer_number code
duplicates drop
drop if missing(code)
drop if missing(issuer_number)
bysort issuer_number: keep if _n == 1
rename issuer_number cusip6
rename code dlg_sic
drop if strlen(cusip6) < 6
save "$tmp/industry_dlg", replace

* ---------------------------------------------------------------------------------------------------------
* Industry data from Factset
* ---------------------------------------------------------------------------------------------------------

use "$raw/factset/sym_sec_entity", clear
mmerge fsym_id using "$raw/factset/sym_cusip", unmatched(m)
drop if missing(cusip)
drop _merge
gen issuer_number = substr(cusip, 1, 6)
keep factset_entity_id issuer_number
duplicates drop
save "$tmp/factset_entity_id_to_cusip6", replace

use "$raw/factset/ent_entity_naics_rank", clear
keep if rank == 1
drop rank
save "$tmp/ent_entity_naics_primary", replace

use "$tmp/factset_entity_id_to_cusip6", clear
mmerge factset_entity_id using  "$tmp/ent_entity_naics_primary", unmatched(m)
drop if missing(naics_code)
bysort issuer_number naics_code: gen cusipNaicsCount = _N
bysort issuer_number: egen max_cusipNaicsCount = max(cusipNaicsCount)
keep if cusipNaicsCount == max_cusipNaicsCount
bys issuer_number: keep if _n == 1
keep issuer_number naics_code
save "$tmp/factset_cusip6_to_naics", replace

use "$tmp/factset_cusip6_to_naics", clear
rename naics_code factset_naics
rename issuer_number cusip6
save "$tmp/industry_factset", replace

* ---------------------------------------------------------------------------------------------------------
* Crosswalks
* ---------------------------------------------------------------------------------------------------------

* SIC-NAICS crosswalk
tempfile sic_naics
tempfile sic_naics_hundreds
tempfile sic_naics_decimal
import delimited using "$raw/industry_codes/sic_naics_crosswalk.csv", clear
rename ïnaicscode naics
rename siccode sic
destring sic, force replace
drop if missing(sic)
gsort -sic -naics
bysort sic: keep if _n ==1
save `sic_naics', replace
use `sic_naics', clear
gen _sic = floor(sic/100)*100
bysort _sic: keep if _n == 1
drop sic
rename _sic sic
save `sic_naics_hundreds', replace
use `sic_naics', clear
gen _sic = floor(sic/10)*10
bysort _sic: keep if _n == 1
drop sic
rename _sic sic
save `sic_naics_decimal', replace
use `sic_naics'
append using `sic_naics_hundreds'
append using `sic_naics_decimal'
bysort sic: keep if _n ==1
save "$tmp/sic_naics_crosswalk", replace

* NAICS-SIC crosswalk
import delimited using "$raw/industry_codes/sic_naics_crosswalk.csv", clear
rename ïnaicscode naics
rename siccode sic
drop if missing(sic) | missing(naics)
gsort -naics -sic 
bysort naics: keep if _n ==1
save "$tmp/naics_sic_crosswalk", replace

* NAICS structure
import excel using "$raw/industry_codes/naics_structure.xlsx", clear firstrow cellrange(A3)
keep NAICSCode NAICSTitle
rename NAICSCode naics
rename NAICSTitle naics_description
destring naics, force replace
drop if missing(naics)
save "$tmp/naics_structure", replace

* GICS codes structure
import excel "$raw/industry_codes/gics_structure.xls", sheet("Effective close of Aug 31,2016") clear
keep E F
rename E gics6_num
rename F gics6_name
drop if _n<5
drop if gics6_num==""
destring gics6_num, replace
duplicates drop
gen simplegics6id =_n
labmask simplegics6id, values(gics6_name)
gen firm_type = 2
*above 67 for real estate
replace firm_type=1 if (simplegics6id>=46 & simplegics6id<=52) | simplegics6id>=67
save "$tmp/gics6_industries", replace	

* ---------------------------------------------------------------------------------------------------------
* Industry data from Morningstar
* ---------------------------------------------------------------------------------------------------------

use "$tmp/Internal_Industry_NonUS_US", clear
destring gics6, replace
mmerge gics6 using "$tmp/gics6_industries", umatch(gics6_num) ukeep(firm_type) uname(ms_)
keep if _merge==3	
drop _merge
unique cusip6
rename gics6 ms_gics6
save "$tmp/industry_morningstar", replace

* ---------------------------------------------------------------------------------------------------------
* Industry data from Compustat-Capital IQ
* ---------------------------------------------------------------------------------------------------------

* Data from compustat and capital IQ
use "$raw/mns_industry/compustat_sic_merge", clear
drop if cusip6 == "0"
gen _cusip6 = cusip6
destring _cusip6, force replace
gen str6 z = string(_cusip6,"%06.0f")
drop _cusip6
rename z _cusip6
order cusip6 _cusip6
replace _cusip6 = "" if _cusip6 == "."
replace cusip6 = _cusip6 if strlen(cusip6) < 6
drop if strlen(cusip6 ) > 6
drop _cusip6
bysort cusip6: keep if _n == 1
save "$tmp/industry_ciq_compustat", replace

* ---------------------------------------------------------------------------------------------------------
* Merge all the sources
* ---------------------------------------------------------------------------------------------------------

* Consolidate files at cusip6 level
use "$tmp/industry_factset", clear
mmerge cusip6 using "$tmp/industry_morningstar", unmatched(b)
mmerge cusip6 using "$tmp/industry_ciq_compustat", unmatched(b)
mmerge cusip6 using "$tmp/industry_dlg", unmatched(b)
drop if cusip6 == "#N/A N"
drop ciq_short *name* *ciqid*
drop *country* cusip
drop sic2 sic3
drop ms_firm_type firm_type
drop ciq_companytype
drop if ciq_ind_sic == 0
rename sic compustat_sic
keep cusip6 factset_naics ms_gics6 ciq_sic compustat_sic dlg_sic
destring dlg_sic, force replace
gen naics = factset_naics
gen gics = ms_gics6
gen sic = dlg_sic
replace sic = ciq_sic if missing(sic)
replace sic = compustat_sic if missing(sic) 
keep cusip6 naics gics sic
drop if missing(naics) & missing(gics) & missing(sic)
mmerge sic using "$tmp/sic_naics_crosswalk", unmatched(m) uname(x_)
preserve
keep if ~missing(sic) & missing(x_naics)
drop if missing(naics)
bysort sic naics: gen combinationCount = _N
bysort sic: egen maxCombinationCount = max(combinationCount)
keep if combinationCount == maxCombinationCount
bysort sic: keep if _n == 1
keep sic naics
rename naics x_naics
save "$tmp/sic_naics_crosswalk_supplement", replace
restore
mmerge sic using "$tmp/sic_naics_crosswalk_supplement", unmatched(m) update
replace naics = x_naics if missing(naics)
keep cusip6 naics gics 
save "$tmp/merged_industry_step1", replace

* Construct GICS-NAICS crosswalk
use "$tmp/merged_industry_step1", clear
keep if ~missing(gics) & ~missing(naics)
bysort gics naics: gen combinationCount = _N
bysort gics: egen maxCombinationCount = max(combinationCount)
keep if combinationCount == maxCombinationCount
drop cusip6
duplicates drop
bysort gics: keep if _n == 1
keep gics naics
save "$tmp/gics_naics_crosswalk_internal", replace

* ---------------------------------------------------------------------------------------------------------
* Industry master file for CUSIP6
* ---------------------------------------------------------------------------------------------------------

* Create industry master file
use "$tmp/merged_industry_step1", clear
mmerge gics using "$tmp/gics_naics_crosswalk_internal", unmatched(m) update
drop _merge
keep cusip6 naics
mmerge naics using "$tmp/naics_structure", unmatched(m)
replace naics_description = "Unclassified" if naics == 999999
drop _merge
rename naics_description naics_industry
tostring naics, gen(naics2)
replace naics2 = substr(naics2, 1, 2)
destring naics2, force replace
mmerge naics2 using "$tmp/naics_structure", umatch(naics) unmatched(m)
rename naics_description naics_sector
replace naics_sector = "Unclassified" if naics2 == 99
replace naics_sector = "Manufacturing" if inlist(naics2, 31, 32, 33)
replace naics_sector = "Retail Trade" if inlist(naics2, 44, 45)
replace naics_sector = "Transportation and Warehousing" if inlist(naics2, 48, 49)
replace naics_sector = subinstr(naics_sector, "sT", "s", .)
replace naics_sector = subinstr(naics_sector, "nT", "n", .)
replace naics_sector = subinstr(naics_sector, "eT", "e", .)
replace naics_sector = subinstr(naics_sector, "gT", "g", .)
replace naics_sector = subinstr(naics_sector, ")T", ")", .)
replace naics_sector = strltrim(naics_sector)
drop _merge
save "$tmp/industry_master_naics", replace

* SIC-based version
use "$tmp/industry_master_naics", clear
mmerge naics using "$tmp/naics_sic_crosswalk", unmatched(m)
drop _merge
drop if missing(sic)
gen sic1 = substr(sic, 1, 1)
gen sic2 = substr(sic, 1, 2)
gen sic_uf = 0
replace sic_uf = 1 if inlist(sic1, "6", "9") | inlist(sic2, "49")
save "$tmp/industry_master_sic", replace

cap log close
